In some of my other posts I created a function that computes the phonological similarity between words using the package FuzzyWuzzy in combination with a function I wrote myself, which generates phonological feature tiers for words in Indonesian. The phonological similarity function can be tweaked a bit to create a sound-similarity dictionary: you pass it a string and it returns the word or words in the language which are most similar. This sort of 'sound thesaurus' is fun to play with and could have numerous applications (beyond finding placename etymologies, the purpose for which I originally created it). The function looks for the most similar word in Indonesian, but it could easily be adapted for English.
In [1]:
#I won't discuss the code below at length because I have discussed it in previous posts
In [2]:
import pandas as pd
import numpy as np
# Load the word list that serves as the lookup dictionary.
kbbi = pd.read_csv("/Users/admin/Desktop/loanwords/clean.kbbi.csv")
kbbi.columns = ['old_index', 'words']
# Force every entry to str: the feature extraction below iterates over each
# word character by character, which only works on strings.
kbbi['words'] = kbbi['words'].apply(str)
Generating a matrix of phonological features:
In [3]:
# Feature name -> set of characters that carry the feature.
# Insertion order is the order in which features appear in each matrix.
# NOTE(review): uppercase 'N' and 'Y' look like single-character stand-ins for
# the digraphs "ng" and "ny" -- confirm against the dictionary preprocessing.
# NOTE(review): 'z' is listed as a continuant consonant but not as coronal,
# strident or voiced, and 'y' is not palatal; left as in the original --
# confirm whether that is intended.
_PHONO_FEATURES = {
    # manner
    # 'n' added: the other nasals (m, N, Y) were already sonorant, 'n' was
    # simply missing from the list.
    'sonorant': frozenset('aeiouywmnNYlrhq'),
    'continuant': frozenset('lrywaeiouszf'),
    'consonant': frozenset('ptkqhcbdgjszfmnYNlr'),
    'syllabic': frozenset('aeiou'),
    'strident': frozenset('sjc'),
    # place: labial, coronal, palatal, velar, glottal
    'labial': frozenset('pmfbwuo'),
    'coronal': frozenset('tdnsjcYierl'),
    'palatal': frozenset('sjciYe'),
    'velar': frozenset('ukgNo'),
    'glottal': frozenset('hq'),
    # nasality
    'nasal': frozenset('mnYN'),
    # voicing: [voice] is assumed to be phonologically active only in obstruents
    'voice': frozenset('bdgj'),
    # liquids
    'lateral': frozenset('l'),
    'rhotic': frozenset('r'),
    # vowel height: mid is assumed not to be an active feature
    'high': frozenset('iu'),
    'low': frozenset('a'),
}


def phono_matrix(string):
    """Return one phonological feature matrix per character of *string*.

    Each matrix is a dict mapping every feature name in ``_PHONO_FEATURES``
    to ``'Y'`` (the character carries the feature) or ``'N'`` (it does not).
    Characters that appear in no feature set get ``'N'`` for every feature.

    Parameters:
        string: the word to analyse; iterated character by character.

    Returns:
        A list of dicts, in the same order as the characters of *string*.
        An empty string yields an empty list.
    """
    return [
        {
            feature: 'Y' if character in members else 'N'
            for feature, members in _PHONO_FEATURES.items()
        }
        for character in string
    ]
In [4]:
# Compute the per-character feature matrixes for every dictionary word.
kbbi['matrixes'] = kbbi['words'].apply(phono_matrix)
In [5]:
def tier_builder(string_of_matrixes):
    """Collapse a list of feature matrixes into one string per feature.

    For each feature listed below, the values found under that key in every
    matrix are concatenated in order, giving one "tier" string for the whole
    word. Values that are ``None`` are skipped. Keys of the matrixes that are
    not in the list (for example 'high' and 'low') are ignored.

    Parameters:
        string_of_matrixes: list of dicts, one per character, each holding
            a value for every feature listed below.

    Returns:
        A dict mapping each feature name to its tier string. An empty input
        list gives an empty string for every feature.
    """
    tier_names = (
        'sonorant', 'consonant', 'continuant', 'syllabic', 'strident',
        'labial', 'coronal', 'palatal', 'velar', 'glottal',
        'nasal', 'voice', 'lateral', 'rhotic',
    )
    return {
        name: ''.join(
            cell[name] for cell in string_of_matrixes if cell[name] is not None
        )
        for name in tier_names
    }
In [6]:
# Turn each word's list of matrixes into its per-feature tier strings.
kbbi['tiers'] = kbbi['matrixes'].apply(tier_builder)
In [7]:
from fuzzywuzzy import fuzz, StringMatcher
import difflib
def similarity(word1_tiers,word2_tiers):
    """Return the mean fuzzy-match ratio between two words' feature tiers.

    For every feature listed below, the two tier strings are compared with
    ``fuzz.ratio``; the result is the mean of those per-feature scores.
    The 'consonant' tier is not part of the comparison.

    Parameters:
        word1_tiers: dict of feature name -> tier string for the first word.
        word2_tiers: dict of feature name -> tier string for the second word.

    Returns:
        The mean of the per-feature ratios, as computed by ``pandas.Series.mean``.
    """
    compared_tiers = (
        'sonorant', 'continuant', 'syllabic', 'strident', 'labial',
        'coronal', 'palatal', 'velar', 'glottal', 'nasal',
        'voice', 'lateral', 'rhotic',
    )
    per_tier_score = {
        name: fuzz.ratio(word1_tiers[name], word2_tiers[name])
        for name in compared_tiers
    }
    return pd.Series(per_tier_score).mean()
In [8]:
def phonosaurus(word,threshold=0): # a higher threshold means you permit more words
    """Find the dictionary word(s) that sound most similar to *word*.

    Every entry of the module-level ``kbbi`` table is scored against *word*
    with ``similarity``. The top score is the highest score that is above 0
    and below 100; entries scoring 100 are never reported, which keeps the
    query word itself (and anything with identical tiers) out of the result.

    Reported words are those that
      * tie with the top score, or
      * score strictly between ``top_score - threshold`` and the top score.

    The whole dictionary is scored before anything is selected, so the result
    does not depend on the order of the rows, and ties are kept.

    Parameters:
        word: the string to look up.
        threshold: how far below the top score a word may fall and still be
            reported. With the default of 0 only top-scoring words are returned.

    Returns:
        A tuple of three strings: the comma-separated matches (in dictionary
        order), the top score, and the threshold score (top score minus
        *threshold*). If no entry scores above 0 and below 100 the match list
        is empty and the top score is 0.
    """
    query_tiers = tier_builder(phono_matrix(word))

    # Score everything up front; this scans the full dictionary on each call.
    scored = [
        (candidate, similarity(candidate_tiers, query_tiers))
        for candidate, candidate_tiers in zip(kbbi['words'], kbbi['tiers'])
    ]

    top_score = max((score for _, score in scored if 0 < score < 100.0), default=0)
    cutoff = top_score - threshold

    indexes = []
    if top_score:
        indexes = [
            candidate
            for candidate, score in scored
            # ties with the best score, plus anything inside the threshold band
            if score == top_score or cutoff < score < top_score
        ]
    return "Most similar: " + ', '.join(indexes),"Top score: " + str(top_score), "Threshold score: " + str(top_score-threshold)
In [ ]:
In [ ]:
In [ ]:
In [16]:
# look up the dictionary word(s) that sound most like 'ajon', using the default threshold of 0
phonosaurus('ajon')
Out[16]:
In [17]:
phonosaurus('ajon',2) # increasing the threshold allows a few more near matches
Out[17]:
In [11]:
# same kind of lookup for 'dodol', with a threshold of 5 passed as the second argument
phonosaurus('dodol',5)
Out[11]:
In [ ]: